##################################################################
##################################################################
## Replication Material
## Stefan Müller: The Temporal Focus of Campaign Communication
## The Journal of Politics
## stefan.mueller@ucd.ie
##
## Script 1: Load and merge datasets
##################################################################
##################################################################

# Note: The file description_replication_material_jop_mueller.pdf describes the purpose of this 
# file in detail and lists the names and sources of all datasets 
# used in this script


# This script was run on the following R version, platform and OS:
# R version 3.6.0 (2019-04-26)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalina 10.15.5

# load packages required to run this script
library(manifestoR)                   # CRAN v1.3.0
library(dplyr)                        # CRAN v1.0.0
library(readr)                        # CRAN v1.3.1
library(car)                          # CRAN v3.0-7
library(haven)                        # CRAN v2.2.0    

# load manifestoR raw data

# set API key (you need to register at the Manifesto Project and get an API key)
# Note: if you simply want to reproduce the analysis of the paper, 
# you can skip this script and continue with the next files

# you can get an API key here: https://manifesto-project.wzb.eu/information/documents/manifestoR

# mp_setapikey(key.file = "manifesto_apikey_mueller.txt") (not added to replication material)
 
# specify the corpus version used for this paper
# mp_use_corpus_version(versionid = "20191220124521")
 
#  get all manifestos (all available languages and dates)
 
# man_full <- mp_corpus(year >= "1945") %>%
#    as.data.frame(with.meta = TRUE)
# save(file = "manifestor_2019_2.RData", man_full)

# (not included in JOP Dataverse -- execute the code above after 
# getting an API key for the Manifesto Corpus and you will be able to use the 2019-2
# version of the Manifesto Corpus)
# restore the saved corpus data frame (provides the object `man_full`)
load(file = "../data_notshare/manifestor_2019_2.RData")

# add two helper columns in a single step:
#   edate             - copy of the manifesto `date` column
#   party_election_id - "<party>_<edate>" key used for merging at a later stage
man_full <- mutate(man_full,
                   edate = date,
                   party_election_id = paste(party, edate, sep = "_"))


# load ParlGov data and wrangle/merge these files

# ParlGov party table: read the csv file first, then keep only the rows of
# the eight countries listed below
parl_gov_party <- read_csv("parlgov_2018_parties.csv")
parl_gov_party <- filter(
    parl_gov_party,
    country_name %in% c("Germany", "Switzerland", "Austria", "United Kingdom",
                        "Ireland", "New Zealand", "Australia", "Canada")
)

# ParlGov cabinet table, processed in one pipeline:
#   1. parse both date columns and store the two ID columns as factors
#   2. keep only the eight countries listed below
#   3. keep the last (!) row listed for each country / election / party
#   4. keep only the variables needed for merging
parl_gov_cabinet <- read_csv("parlgov_2018_cabinets.csv") %>%
    mutate(election_date = as.Date(election_date),
           start_date = as.Date(start_date),
           country_id = as.factor(country_id),
           party_id = as.factor(party_id)) %>%
    filter(country_name %in% c("Germany", "Switzerland",
                               "Austria", "United Kingdom",
                               "Ireland", "New Zealand",
                               "Australia", "Canada")) %>%
    group_by(country_name, election_date, party_name) %>%
    # the last row of a group is the one whose position equals the group size
    filter(row_number() == n()) %>%
    ungroup() %>%
    dplyr::select(country_name_short, party_id, election_date, start_date,
                  cabinet_party, prime_minister)


# ParlGov election table:
#   - parse election_date as a Date
#   - seat_share = seats / seats_total (both coerced to numeric)
#   - vote_share coerced to numeric
#   - party IDs 808 and 1180 are recoded to 1727 (correction of a wrong ID)
#   - keep only the eight countries listed below
parl_gov_election <- read_csv("parlgov_2018_elections.csv") %>%
    mutate(election_date = as.Date(election_date),
           seat_share = as.numeric(seats) / as.numeric(seats_total),
           vote_share = as.numeric(vote_share),
           party_id = car::recode(party_id, "808=1727;1180=1727")) %>%
    filter(country_name %in% c("Germany", "Switzerland",
                               "Austria", "United Kingdom",
                               "Ireland", "New Zealand",
                               "Australia", "Canada"))


# store the ID columns of the party table as factors and reduce the table to
# the three columns used below (country code, CMP code, ParlGov party ID)
parl_gov_party <- parl_gov_party %>%
    mutate(country_id = as.factor(country_id),
           party_id = as.factor(party_id)) %>%
    dplyr::select(country_name_short, cmp, party_id)


# restrict the election table:
#   - drop European Parliament elections (election_type "ep")
#   - drop CSU observations (we use CDU instead)
#   - store the ID columns as factors
#   - keep only elections held after 1944-01-01
parl_gov_election <- parl_gov_election %>%
    ungroup() %>%
    filter(election_type != "ep",
           party_name_short != "CSU") %>%
    mutate(country_id = as.factor(country_id),
           party_id = as.factor(party_id)) %>%
    filter(election_date > "1944-01-01")


# add the party information (CMP code) to every election result
parl_gov_party_election <- parl_gov_election %>%
    left_join(parl_gov_party,
              by = c("country_name_short", "party_id"))


# add the cabinet data to the party-election data, then sort by country,
# party, and election date so that lag() returns the value of the same
# party's previous row when creating cabinet_party_lag and prime_minister_lag
parl_gov_complete <- parl_gov_party_election %>%
    left_join(parl_gov_cabinet,
              by = c("country_name_short", "election_date", "party_id")) %>%
    arrange(country_name, party_id, election_date) %>%
    group_by(country_name, party_id) %>%
    mutate(cabinet_party_lag = lag(cabinet_party),
           prime_minister_lag = lag(prime_minister)) %>%
    ungroup()


# include Martin Mölder's (2017) modified codes from his Electoral Studies paper
# URL to paper: https://doi.org/10.1016/j.electstud.2016.12.004
# Martin Mölder kindly provided the csv file, which includes codes for merging
# ParlGov and CMP data
# party_id (from pg_id), party (from cmp_id) and date (from election) are
# copies under the column names that the joins below expect
data_corrections <- read_csv("merge_pg_cmp.csv") %>%
    mutate(party_id = pg_id,
           party = cmp_id,
           date = as.factor(election))


# unique CMP manifesto codes (party and date) found in the manifesto dataset
man_full_cmp_codes <- unique(dplyr::select(man_full, party, date)) %>%
    mutate(date = as.factor(date))

# full join: keeps the rows of both tables, matched on party and date
data_corrections_combined <- data_corrections %>%
    full_join(man_full_cmp_codes, by = c("party", "date"))


# build the merge key `election` from election_date: the inner gsub() removes
# the last two characters (the day), the outer gsub() removes the "-"
# separators; `country` is a copy of country_name; election and party_id are
# stored as factors
parl_gov_complete <- parl_gov_complete %>%
    ungroup() %>%
    mutate(party_id = factor(party_id),
           election = factor(gsub("-", "", gsub(".{2}$", "", election_date))),
           country = country_name)

# same factor conversion for the keys of the corrections table
data_corrections_combined <- data_corrections_combined %>%
    mutate(election = factor(election),
           party_id = factor(party_id))

# merge corrections with ParlGov data
parl_gov_complete_merge <- parl_gov_complete %>%
    left_join(data_corrections_combined,
              by = c("election", "country", "party_id"))


# build the final CMP identifier and a party-election key:
#   1. if the CMP ID from Martin Mölder's harmonised data is missing, use the
#      ParlGov "cmp" variable
#   2. recode the CDU ID to 41521 (not the same because CDU and CSU are
#      counted as two parties)
#   3. party_id_election_date = "<election_date>_<party_id>"
parl_gov_complete_merge <- parl_gov_complete_merge %>%
    mutate(cmp_id_final = ifelse(is.na(cmp_id), cmp, cmp_id),
           cmp_id_final = ifelse(
               party_name == "Christlich Demokratische Union",
               41521, cmp_id_final),
           party_id_election_date = paste(election_date, party_id,
                                          sep = "_"))

# remove observations that would otherwise result in duplicated entries
# because two parties would be assigned the same manifesto:
#   "1964-10-15_1496": Conservatives and National Liberals (1964)
#   "1966-03-31_1496": Conservatives and National Liberals (1966)
#   "1987-02-17_2217": Sinn Féin (1987)
#   "1989-06-15_2217": Sinn Féin (1989)
parl_gov_complete_merge_clean <- filter(
    parl_gov_complete_merge,
    !party_id_election_date %in% c("1964-10-15_1496",
                                   "1966-03-31_1496",
                                   "1987-02-17_2217",
                                   "1989-06-15_2217")
)

# load current CMP dataset (2019 version) and prepare it for merging:
#   - party and date as character
#   - party_election / country_election: pasted identifiers
#   - year: first four characters of date
#   - country is renamed to country_id_cmp
#   - cmp_id_final: party as a factor (use this for merging)
#   - seat_share_cmp: absseat / totseats
#   - lagged seat share, absolute seats, and total seats: value of the
#     previous row within each countryname / partyname group
cmp_data <- readRDS("cmp_maindata_2019.rds") %>%
    mutate(party = as.character(party),
           date = as.character(date),
           party_election = paste(party, edate),
           year = substr(date, 1, 4),
           country_election = paste(countryname, date)) %>%
    dplyr::rename(country_id_cmp = country) %>%
    mutate(cmp_id_final = as.factor(party),
           seat_share_cmp = absseat / totseats) %>%
    group_by(countryname, partyname) %>%
    mutate(seat_share_cmp_lag = lag(seat_share_cmp),
           absseat_lag = lag(absseat),
           totseats_lag = lag(totseats)) %>%
    ungroup()


# prepare the ParlGov data for the merge with CMP (through date and party):
# keep the election key as date_parlgov, make cmp_id_final a character
# variable, drop the old party and date columns, remove duplicated rows, and
# re-create `date` from the election key
parl_gov_complete_merge_clean <- parl_gov_complete_merge_clean %>%
    mutate(date_parlgov = election,
           cmp_id_final = as.character(cmp_id_final)) %>%
    dplyr::select(-party, -date) %>%
    unique() %>%
    mutate(date = date_parlgov)


# merge CMP with ParlGov data, then
#   - remove the "per_..." salience variables
#     NOTE(review): starts_with("per") matches every column whose name begins
#     with "per" -- confirm that no other such column is needed later
#   - create a party-election identifier (party_election_id)
#   - move the four government-status variables to the front
cmp_pg <- cmp_data %>%
    left_join(parl_gov_complete_merge_clean,
              by = c("date", "cmp_id_final")) %>%
    dplyr::select(-starts_with("per")) %>%
    mutate(party_election_id = paste(party, date, sep = "_")) %>%
    dplyr::select(cabinet_party, cabinet_party_lag,
                  prime_minister, prime_minister_lag,
                  everything())


# year_merge: separate labels for the cases when two elections took place in
# one year (Ireland 1982, United Kingdom 1974); all other rows keep `year`
# NOTE(review): year_merge is not referenced by the join below, which matches
# on `year` -- confirm that this is intended
cmp_pg <- cmp_pg %>%
    mutate(year_merge = ifelse(
        countryname == "Ireland" & edate == "1982-02-18", "1982-01",
        ifelse(
            countryname == "Ireland" & edate == "1982-11-24", "1982-02",
            ifelse(
                countryname == "United Kingdom" & edate == "1974-02-28", "1974-02",
                ifelse(
                    countryname == "United Kingdom" & edate == "1974-10-10", "1974-10",
                    year)))))


# load economic data (year stored as a factor)
dat_economic <- read_csv("data_economic.csv") %>%
    mutate(year = as.factor(year))

# merge economic data with the merged CMP-ParlGov data frame (by year and
# country) and rename the CMP election date to election_date_cmp
cmp_pg_economic <- cmp_pg %>%
    left_join(dat_economic, by = c("year", "countryname")) %>%
    rename(election_date_cmp = edate)

# merge the party-election metadata onto the sentence/manifesto-level data
# (date and party are dropped from the metadata before the join; the join key
# is party_election_id) and keep only the countries relevant for the analysis
dat_merged <- man_full %>%
    left_join(dplyr::select(cmp_pg_economic, -c(date, party)),
              by = "party_election_id") %>%
    filter(countryname %in% c("Germany", "Austria",
                              "Ireland", "United Kingdom",
                              "United States", "Australia",
                              "Switzerland", "New Zealand",
                              "Canada"))


# load Comparative Political Dataset and rename variables
# The one-row lags are computed within each country with `order_by = year`,
# so they follow the order of the years rather than the row order of the
# .dta file. The grouping is removed afterwards so that a grouped tibble
# does not leak into later steps.
cpds <- haven::read_dta("CPDS-1960-2016-Update-2018.dta") %>% 
    rename(countryname = country,
           inflation_cpds = inflation,
           nomgdpgr_cpds = nomgdpgr,
           realgdpgr_cpds = realgdpgr,
           unemp_cpds = unemp) %>% 
    mutate(year = as.character(year)) %>% 
    group_by(countryname) %>% 
    mutate(inflation_cpds_lag = lag(inflation_cpds, order_by = year),
           nomgdpgr_cpds_lag = lag(nomgdpgr_cpds, order_by = year),
           realgdpgr_cpds_lag = lag(realgdpgr_cpds, order_by = year),
           unemp_cpds_lag = lag(unemp_cpds, order_by = year)) %>% 
    ungroup() %>% 
    dplyr::select(countryname, year, contains("cpds"))


# add the CPDS indicators to the merged data (by country and year)
dat_merged <- left_join(dat_merged, cpds, 
                        by = c("countryname", "year"))

# print the number of unique manifestos and the number of rows
length(unique(dat_merged$manifesto_id))
nrow(dat_merged)

# now we need to make sure that the cabinet_party and prime_minister
# data are correct

# one row per unique combination of the selected party-election variables,
# plus the earliest election_date_cmp of each party (first_election),
# sorted by country, party, and edate
dat_overview_cabinetstatus <- dat_merged %>%
    dplyr::select(countryname, partyname, party, election_date_cmp, edate,
                  absseat, absseat_lag,
                  cabinet_party, prime_minister,
                  cabinet_party_lag, prime_minister_lag) %>%
    unique() %>%
    group_by(party) %>%
    mutate(first_election = min(election_date_cmp)) %>%
    ungroup() %>%
    arrange(countryname, party, edate)

# observations with missing cabinet_party_lag, with two empty columns that
# are filled in by hand
dat_missing_cabinetstatuts <- dat_overview_cabinetstatus %>%
    filter(is.na(cabinet_party_lag)) %>%
    mutate(cabinet_party_lag_added = "",
           prime_minister_lag_added = "")
# rio::export(dat_missing_cabinetstatuts, "data_missing_cabinet_status.xlsx")
# save this file and manually input the missing information in
# cabinet_party_lag_added and prime_minister_lag_added

# load the file with the manually added cabinet data
dat_cabinetstatus_added <- read_csv("data_missing_cabinet_status_added.csv")

# keep the merge keys and the manually added ("..._added") columns only
dat_cabinetstatus_added_select <- dplyr::select(
    dat_cabinetstatus_added,
    countryname, party, edate, contains("_added")
)

# attach the manually added information to the cabinet status overview
dat_overview_cabinetstatus <- dat_overview_cabinetstatus %>%
    left_join(dat_cabinetstatus_added_select,
              by = c("countryname", "party", "edate"))

# now create indicators for the lagged cabinet status and prime minister
# status that use the data from the "..._added" columns if the original
# variable is NA
dat_overview_cabinetstatus <- dat_overview_cabinetstatus %>%
    mutate(
        cabinet_party_lag_merged = ifelse(is.na(cabinet_party_lag),
                                          cabinet_party_lag_added,
                                          cabinet_party_lag),
        prime_minister_lag_merged = ifelse(is.na(prime_minister_lag),
                                           prime_minister_lag_added,
                                           prime_minister_lag)
    )

# print summaries of the two merged indicators
summary(dat_overview_cabinetstatus$cabinet_party_lag_merged)
summary(dat_overview_cabinetstatus$prime_minister_lag_merged)


# correct some errors from the ParlGov data
# (each line overwrites cabinet_party_lag_merged for one party-election and
# leaves all other rows unchanged)
dat_overview_cabinetstatus <- dat_overview_cabinetstatus %>%
    mutate(
        # 2015 Lib Dems: cabinet party lag
        cabinet_party_lag_merged = ifelse(party == "51421" & edate == 201505,
                                          1, cabinet_party_lag_merged),
        # FDP 2013: cabinet party lag
        cabinet_party_lag_merged = ifelse(party == "41420" & edate == 201309,
                                          1, cabinet_party_lag_merged),
        # FDP 2017: not cabinet party lag
        cabinet_party_lag_merged = ifelse(party == "41420" & edate == 201709,
                                          0, cabinet_party_lag_merged),
        # Greens 2016 in Ireland: not cabinet party lag
        cabinet_party_lag_merged = ifelse(party == "53110" & edate == 201602,
                                          0, cabinet_party_lag_merged),
        # Greens 2011 in Ireland: cabinet party lag
        cabinet_party_lag_merged = ifelse(party == "53110" & edate == 201102,
                                          1, cabinet_party_lag_merged)
    )

# create incumbency status variables
# incumbency_status2: "Incumbent" if cabinet_party_lag_merged is 1,
#                     "Opposition" otherwise (also when the value is missing)
# incumbency_status3: "PM Incumbent" if prime_minister_lag_merged is 1,
#                     else "Non-PM Incumbent" if cabinet_party_lag_merged is 1,
#                     else "Opposition" (missing values count as not 1)
# incumbency_status4: incumbency_status3, except that parties with zero or
#                     missing absseat_lag are "Not in previous parliament"
dat_overview_cabinetstatus <- dat_overview_cabinetstatus %>%
    mutate(
        incumbency_status2 = if_else(
            !is.na(cabinet_party_lag_merged) & cabinet_party_lag_merged == 1,
            "Incumbent", "Opposition"),
        incumbency_status3 = case_when(
            prime_minister_lag_merged == 1 ~ "PM Incumbent",
            cabinet_party_lag_merged == 1 ~ "Non-PM Incumbent",
            TRUE ~ "Opposition"),
        incumbency_status4 = ifelse(absseat_lag == 0 | is.na(absseat_lag),
                                    "Not in previous parliament",
                                    incumbency_status3)
    )

# recode incumbency_status4 for the German Green party and the Left party
# for four elections
# (party was basically renamed and therefore absseat_lag was NA)
# the other variables are not affected because the Greens were in opposition anyway
# results do not change when treating these parties as "new parties"
dat_overview_cabinetstatus <- dat_overview_cabinetstatus %>%
    mutate(incumbency_status4 = ifelse(
        (party == "41112" & edate == 199012) |
            (party == "41113" & edate == 199410) |
            (party == "41222" & edate == 200509) |
            (party == "41223" & edate == 200909),
        "Opposition", incumbency_status4))


# convert the incumbency indicators into factors with an explicit level order
dat_overview_cabinetstatus$incumbency_status2_factor <- factor(
    dat_overview_cabinetstatus$incumbency_status2,
    levels = c("Opposition", "Incumbent"))
dat_overview_cabinetstatus$incumbency_status3_factor <- factor(
    dat_overview_cabinetstatus$incumbency_status3,
    levels = c("Opposition", "Non-PM Incumbent", "PM Incumbent"))
# incumbency_status4 stores cabinet parties without the prime minister as
# "Non-PM Incumbent". factor() turns every value that is not listed in
# `levels` into NA, so this value has to be listed here; `labels` then names
# that level "Cabinet party", which keeps the level names of this factor
# unchanged.
dat_overview_cabinetstatus$incumbency_status4_factor <- factor(
    dat_overview_cabinetstatus$incumbency_status4,
    levels = c("Not in previous parliament", "Opposition",
               "Non-PM Incumbent", "PM Incumbent"),
    labels = c("Not in previous parliament", "Opposition",
               "Cabinet party", "PM Incumbent"))
# now remove unnecessary variables from dat_merged and merge the
# clean dataset with information on the government status

# drop the four uncorrected government-status variables
dat_merged_select <- dplyr::select(dat_merged,
                                   -cabinet_party, -cabinet_party_lag,
                                   -prime_minister, -prime_minister_lag)

# keep the merge keys, all cabinet_party / prime_minister columns, and the
# incumbency factors
dat_overview_cabinetstatus_select <- dplyr::select(
    dat_overview_cabinetstatus,
    countryname, party, edate,
    contains("cabinet_party"),
    contains("prime_minister"),
    contains("_factor")
)

# merge the manifestos with the meta data on government status
dat_merged_final <- dat_merged_select %>%
    left_join(dat_overview_cabinetstatus_select,
              by = c("party", "edate", "countryname"))


# combine the economic indicators and add an election identifier
#   gdp_growth_lag:   if NA use nomgdpgr_cpds_lag
#   inflation_lag:    if NA use inflation_cpds_lag
#   unemployment_lag: if NA use unemp_cpds_lag
#   election_id:      "<countryname>_<date>"
dat_merged_final <- dat_merged_final %>%
    mutate(gdp_growth_lag = ifelse(is.na(gdp_growth_lag),
                                   nomgdpgr_cpds_lag, gdp_growth_lag),
           inflation_lag = ifelse(is.na(inflation_lag),
                                  inflation_cpds_lag, inflation_lag),
           unemployment_lag = ifelse(is.na(unemployment_lag),
                                     unemp_cpds_lag, unemployment_lag),
           election_id = paste(countryname, date, sep = "_"))

# print the number of rows per language
table(dat_merged_final$language)

# only include German and English manifestos (exclude a few French and
# Italian documents from Switzerland); afterwards correct the language of the
# National Party of Australia manifesto from 1993, which is incorrectly coded
# as "german"
dat_merged_final <- dat_merged_final %>%
    filter(language %in% c("english", "german")) %>%
    mutate(language = ifelse(manifesto_id == "63810_199303",
                             "english", language))


# save dataset
saveRDS(dat_merged_final, file = "data_merged.rds")

